'''
Created on 2011-01-06

@author: Hanna Dutkiewicz
'''
from HTMLParser import HTMLParser
from nltk.tokenize.treebank import TreebankWordTokenizer

'''
HTML parser class that extracts metadata
'''
class MetaParser(HTMLParser):
    '''
    HTML parser that copies title and <meta> text into a document object.

    The whole input is parsed inside the constructor. While parsing:

    * text found between <title> and </title> is tokenized and passed to
      ``document.addTitleWords``;
    * the name of any tag opened inside the title is passed to
      ``document.addTitleWord``;
    * for each <meta> tag outside the title, the text of its ``value``
      attribute (or, when there is none, of its ``content`` attribute) is
      tokenized and passed to ``document.addMetadataWords``.

    The document object is supplied by the caller; this class only calls
    the three methods named above on it.
    '''

    # One tokenizer instance shared by every parser.
    tokenizer = TreebankWordTokenizer()

    # <meta> attributes searched for metadata text, in order of preference.
    # 'value' stays first for backward compatibility; 'content' is the
    # attribute the HTML standard defines for <meta>.
    _META_TEXT_ATTRS = ('value', 'content')

    def __init__(self, content, theDocument):
        '''
        Parse ``content`` immediately and record the results in
        ``theDocument``.

        content     -- HTML text handed to ``feed``
        theDocument -- object receiving the title and metadata words
        '''
        # HTMLParser is an old-style class on Python 2, so the base
        # initializer is called explicitly instead of through super().
        HTMLParser.__init__(self)
        self.document = theDocument
        # 1 while the parser is between <title> and </title>, otherwise 0.
        self.isTitle = 0
        self.feed(content)

    def getChangedDocument(self):
        '''Return the document object passed to the constructor.'''
        return self.document

    def _metaText(self, attrs):
        '''
        Return the text of the preferred metadata attribute of a <meta> tag.

        attrs -- list of (name, value) pairs as supplied by HTMLParser

        The first ``value`` attribute wins; if the tag has none, the first
        ``content`` attribute is used. Returns None when neither attribute
        is present or when the chosen attribute was written without a value.
        '''
        for wanted in self._META_TEXT_ATTRS:
            for name, text in attrs:
                if name == wanted:
                    return text
        return None

    def handle_starttag(self, tag, attrs):
        '''
        React to an opening tag.

        Inside the title every opening tag name is recorded as a title word,
        so a <meta> met before </title> is not treated as metadata.
        '''
        if self.isTitle:
            self.document.addTitleWord(tag)
        elif tag == 'title':
            self.isTitle = 1
        elif tag == 'meta':
            text = self._metaText(attrs)
            # A valueless attribute (e.g. <meta value>) is reported as None,
            # which the tokenizer cannot process; skip it instead of failing.
            if text is not None:
                self.document.addMetadataWords(self.tokenizer.tokenize(text))

    def handle_endtag(self, tag):
        '''Leave title mode when </title> is seen.'''
        if tag == 'title':
            self.isTitle = 0

    def handle_data(self, data):
        '''Tokenize text found inside the title and add it to the document.'''
        if self.isTitle:
            self.document.addTitleWords(self.tokenizer.tokenize(data))
            
    
            
